This notebook analyzes e-commerce delivery performance, focusing on efficiency, customer satisfaction, and logistical patterns through key metrics such as delivery times, distance efficiency, cost ratios, and late deliveries. It includes geospatial analysis of customer and seller locations, examining the top 10 largest sellers and product categories to uncover insights into delivery dynamics and performance across regions. Metrics like freight share, total order cost per km, and distance efficiency are explored to evaluate operational effectiveness, while delivery time shares (preprocessing, carrier, and customer) provide a breakdown of delivery stages. By integrating geospatial data and calculated metrics, the analysis highlights areas for improvement in logistics, pricing, and delivery strategies.
# @title Preliminary steps
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import logging
# Load the Olist e-commerce datasets (Colab + Google Drive paths)
customers = pd.read_csv('/content/drive/MyDrive/olist_customers_dataset.csv')
geolocation = pd.read_csv('/content/drive/MyDrive/olist_geolocation_dataset.csv')
order_items = pd.read_csv('/content/drive/MyDrive/olist_order_items_dataset.csv')
order_payments = pd.read_csv('/content/drive/MyDrive/olist_order_payments_dataset.csv')
reviews = pd.read_csv('/content/drive/MyDrive/olist_order_reviews_dataset.csv')
orders = pd.read_csv('/content/drive/MyDrive/olist_orders_dataset.csv')
products = pd.read_csv('/content/drive/MyDrive/olist_products_dataset.csv')
sellers = pd.read_csv('/content/drive/MyDrive/olist_sellers_dataset.csv')
product_names = pd.read_csv('/content/drive/MyDrive/product_category_name_translation.csv')
# Setting chart style
# Suppress all warnings (NOTE(review): blanket suppression can hide real issues)
warnings.filterwarnings("ignore")
# Suppress matplotlib font-related and other warnings emitted via logging
logging.getLogger('matplotlib').setLevel(logging.ERROR)
# Set "serif" as the global font family with specific styling parameters
mpl.rcParams.update({
    "font.family": "serif",
    "font.serif": ["Liberation Serif"],  # concrete serif face (metric-compatible Times substitute)
    "font.weight": "normal",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})
# Sample 10 evenly spaced colors from the Viridis colormap for later plots
viridis_colors = plt.cm.viridis(np.linspace(0, 1, 10))
# Dataset info
# datasets = {
# 'customers': customers,
# 'geolocation': geolocation,
# 'order_items': order_items,
# 'order_payments': order_payments,
# 'reviews': reviews,
# 'orders': orders,
# 'products': products,
# 'sellers': sellers,
# 'product_names': product_names
# }
# for dataset_names, dataset in datasets.items():
# print('Dataset name: ', dataset_names)
# print(f'Shape of dataset: {dataset.shape}')
# column_info = []
# for column in dataset.columns:
# col_name = column
# dtype = dataset[column].dtype
# unique_values = dataset[column].nunique()
# missing_vals = dataset[column].isnull().sum()
# column_info.append(
# {'Column name': col_name,
# 'Data type': dtype,
# 'Unique values': unique_values,
# 'Missing values': missing_vals
# })
# column_info_df = pd.DataFrame(column_info)
# print(column_info_df)
# print('------------------------------------------------------------------------')
### Adding customer geolocation data to customers table
# geolocation contains more than 1M records; the same zip code has several lat/long values (could be different streets, buildings, etc.)
# however, these exact locations don't match to customers/ sellers exactly
# when matching, we take the first occurrence of latitude, longitude for each zip code
geolocation_unique = geolocation.drop_duplicates(subset='geolocation_zip_code_prefix')
# merge geolocation data with customers
# (left join: customers whose zip prefix is absent from geolocation get NaN coordinates)
customers_geo = pd.merge(customers, geolocation_unique[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
                         left_on='customer_zip_code_prefix', right_on='geolocation_zip_code_prefix', how='left')
# rename customers_geo columns with prefixes 'c_' (c_ = customer-side coordinates)
customers_geo = customers_geo.rename(columns={'geolocation_lat': 'c_geolocation_lat', 'geolocation_lng': 'c_geolocation_lng'})
### Adding sellers geolocation data to sellers table
# merge geolocation data with sellers
sellers_geo = pd.merge(sellers, geolocation_unique[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
                       left_on='seller_zip_code_prefix', right_on='geolocation_zip_code_prefix', how='left')
# rename sellers_geo columns with prefixes 's_' (s_ = seller-side coordinates)
sellers_geo = sellers_geo.rename(columns={'geolocation_lat': 's_geolocation_lat', 'geolocation_lng': 's_geolocation_lng'})
### Adding order item info (e.g. product id, seller id, price, freight value) to orders table
# to orders dataset add product_id, seller_id and price and freight_value from order_items
# (one-to-many: this produces one row per order item, so order_id values repeat)
orders_merge_items = pd.merge(orders, order_items[['order_id', 'product_id', 'seller_id', 'price', 'freight_value']], on='order_id', how='left')
# # orders_merge_items contains additional records, because each order can consist of multiple items
# # display the count of orders by the number of products it contains
# num_items = orders_merge_items['order_id'].value_counts().reset_index().rename(columns={'count': 'num_items_in_order'})
# num_orders = num_items.groupby('num_items_in_order')['order_id'].count().reset_index().rename(columns={'order_id': 'num_orders'})
# # calculate cumulative percentage
# total_orders = num_orders['num_orders'].sum()
# num_orders['cumulative_count'] = num_orders['num_orders'].cumsum()
# num_orders['cumulative_percentage'] = (num_orders['cumulative_count'] / total_orders * 100).round(2)
# num_orders
# # we can also look into an order and the number of different product sellers
# num_sellers = orders_merge_items.groupby('order_id')['seller_id'].nunique().reset_index().rename(columns={'seller_id': 'num_sellers_in_order'})
# num_orders = num_sellers.groupby('num_sellers_in_order').count().reset_index().rename(columns={'order_id': 'num_orders'})
# # calculate cumulative percentage
# total_orders = num_orders['num_orders'].sum()
# num_orders['cumulative_count'] = num_orders['num_orders'].cumsum()
# num_orders['cumulative_percentage'] = (num_orders['cumulative_count'] / total_orders * 100).round(2)
# num_orders
# we can observe that almost 99% of orders are filled by a single seller (including multiple item orders)
### Adding customer geolocation info to core orders table
# merge customer geolocation data to orders_merge_items dataset
orders_merge_items_geoc = pd.merge(orders_merge_items, customers_geo[['customer_id', 'c_geolocation_lat', 'c_geolocation_lng']], on='customer_id', how='left')
### Adding sellers geolocation info to core orders table
# merge seller geolocation data to orders_merge_items dataset
orders_merge_items_geo = pd.merge(orders_merge_items_geoc, sellers_geo[['seller_id', 's_geolocation_lat', 's_geolocation_lng']], on='seller_id', how='left')
### Remove any missing values for coordinates (for plotting)
# # check for any missing geolocation info
# print(customers_geo.isna().sum())
# # both customers and sellers datasets contain nan values for geolocation
# # it could be due to customer and sellers zip codes not included in the geolocation dataset
# nan_zip_prefixes = customers_geo[customers_geo['c_geolocation_lat'].isna()]['customer_zip_code_prefix'].unique()
# print('Missing geolocation info for these zip codes present in customers dataset: ', nan_zip_prefixes)
# print('Number of zip codes missing: ', len(nan_zip_prefixes))
# remove rows with NaN in latitude or longitude columns (required for distance calc / plotting)
orders_merge_items_geo_clean = orders_merge_items_geo.dropna(subset=['c_geolocation_lat', 'c_geolocation_lng', 's_geolocation_lat', 's_geolocation_lng'])
### Aggregate order items (number and type of products, the price and freight value) to a core orders table
### ---> a unit of analysis is a particular order customer places
# in orders_merge_items we have duplicate order_ids as orders are split up based on items
# we want to have a record for each order:
# output average delivery times, sum of price, sum of freight value, for geolocation we can take average (these values are the same for all order ids)
# we also want to preserve customer_id, seller_id (in case of customer we can take first and for seller the mode)
# create an aggregation function mapping based on the data types of the columns
agg_funcs = {}
# numeric columns: sum the money fields, average everything else
numeric_columns = orders_merge_items_geo_clean.select_dtypes(include=['float64', 'int64', 'timedelta64[ns]']).columns
for col in numeric_columns:
    if 'price' in col or 'freight_value' in col:  # Use sum for price-related fields
        agg_funcs[col] = 'sum'
    else:  # Use mean for other numeric fields
        agg_funcs[col] = 'mean'
# object columns: use mode (most frequent value)
object_columns = orders_merge_items_geo_clean.select_dtypes(include=['object']).columns
for col in object_columns:
    if col == 'order_id':  # We are grouping by order_id, so no need to aggregate it
        continue
    elif col == 'customer_id':  # Taking first customer_id (assuming one customer per order)
        agg_funcs[col] = 'first'
    elif col == 'seller_id' or col == 'product_id':  # Mode for seller_id and product_id
        # NOTE(review): mode()[0] raises IndexError if a group's values are all NaN — verify this cannot occur here
        agg_funcs[col] = lambda x: x.mode()[0]
    else:  # For other object fields, use the first entry (e.g., order_status)
        agg_funcs[col] = 'first'
# Datetime columns: take the first row's timestamp
# NOTE(review): 'first' is positional (first row in the group), not the chronological minimum
datetime_columns = orders_merge_items_geo_clean.select_dtypes(include=['datetime64[ns]', 'period[M]']).columns
for col in datetime_columns:
    agg_funcs[col] = 'first'
# Apply the aggregation: collapse item-level rows down to one row per order
orders_merged = orders_merge_items_geo_clean.groupby('order_id').agg(agg_funcs).reset_index()
### Adding relevant payments info (e.g. payment type, # of installments, payment value) to core orders table
# order_payments['payment_sequential'].value_counts().reset_index()
# an order can have multiple payment rows; collapse them to one record per order:
# sum the payment values, take the most common payment type, and the max installments
# group by 'order_id' and aggregate
order_payments_aggregated = order_payments.groupby('order_id').agg(
    total_payment_value=('payment_value', 'sum'),  # sum of payment values
    payment_type=('payment_type', lambda x: x.mode()[0]),  # most common payment type
    payment_installments=('payment_installments', 'max')  # max number of installments
).reset_index()
# order_payments_aggregated.head()
# for each order id we also want to add info from payments dataset
orders_merged = pd.merge(orders_merged, order_payments_aggregated[['order_id', 'payment_type', 'payment_installments', 'total_payment_value']], on='order_id', how='left')
### Adding product relevant info to core orders table
orders_merged = pd.merge(orders_merged, products[['product_id', 'product_category_name', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']], on='product_id', how='left')
# product volume (cubic cm) from the three linear dimensions
orders_merged['product_vol'] = orders_merged['product_length_cm'] * orders_merged['product_height_cm'] * orders_merged['product_width_cm']
### Adding review score to core orders table
# add the review score to see if reviews somehow correlate with delivery times/ delivery performance metrics
# NOTE(review): if an order has multiple reviews, this one-to-many merge duplicates its row — confirm reviews are unique per order
orders_merged = pd.merge(orders_merged, reviews[['order_id', 'review_score']], on='order_id', how='left')
| Category | Metrics |
|---|---|
| Delivery Time Metrics | Preprocessing time; Carrier takeover time; Customer delivery time; Total delivery time; Time to estimate (days); Late deliveries (%) |
| Delivery Performance Metrics | Distance (km); Distance efficiency (distance_km / delivery_time_days); Freight cost per km (freight_value / distance_km); Price to distance ratio (price / distance_km); Cost efficiency (freight_value / delivery_time_days); Order size per km (price / distance_km); Total order cost per km (total_payment_value / distance_km); Freight share (freight_value / total_payment_value) |
| Percentage Metrics | Preprocessing time %; Carrier takeover time %; Customer delivery time % |
# @title Calculations
import numpy as np
import pandas as pd
# Create a copy for calculations
# NOTE(review): this rebinds `orders`, shadowing the raw CSV dataframe loaded earlier
orders = orders_merged.copy()
# Ensure relevant columns are datetime; unparseable values become NaT
datetime_cols = [
    'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
    'order_delivered_customer_date', 'order_estimated_delivery_date'
]
for col in datetime_cols:
    orders[col] = pd.to_datetime(orders[col], errors='coerce')
# Stage durations of the delivery pipeline (as timedeltas)
orders['order_processing_time'] = orders['order_approved_at'] - orders['order_purchase_timestamp']
orders['carrier_delivery_time'] = orders['order_delivered_carrier_date'] - orders['order_approved_at']
orders['customer_delivery_time'] = orders['order_delivered_customer_date'] - orders['order_delivered_carrier_date']
# positive => delivered after the estimate (late); negative => delivered early
orders['time_to_estimate_delivery'] = orders['order_delivered_customer_date'] - orders['order_estimated_delivery_date']
# Calculate total delivery time (purchase -> delivery to customer)
orders['delivery_time'] = orders['order_delivered_customer_date'] - orders['order_purchase_timestamp']
# Convert these timedeltas to fractional days for easier interpretation/plotting
orders['order_processing_time_days'] = orders['order_processing_time'].dt.total_seconds() / (24 * 3600)
orders['carrier_delivery_time_days'] = orders['carrier_delivery_time'].dt.total_seconds() / (24 * 3600)
orders['customer_delivery_time_days'] = orders['customer_delivery_time'].dt.total_seconds() / (24 * 3600)
orders['time_to_estimate_delivery_days'] = orders['time_to_estimate_delivery'].dt.total_seconds() / (24 * 3600)
orders['delivery_time_days'] = orders['delivery_time'].dt.total_seconds() / (24 * 3600)
# For each order, the share of total delivery time spent in each stage (the three shares sum to ~1)
orders['processing_share'] = orders['order_processing_time_days'] / orders['delivery_time_days']
orders['carrier_share'] = orders['carrier_delivery_time_days'] / orders['delivery_time_days']
orders['customer_share'] = orders['customer_delivery_time_days'] / orders['delivery_time_days']
# Identify and handle negative or invalid delivery times
# print('Length of dataframe before: ', len(orders))
negative_processing = (orders['order_processing_time_days'] < 0).sum()
negative_carrier = (orders['carrier_delivery_time_days'] < 0).sum()
negative_customer = (orders['customer_delivery_time_days'] < 0).sum()
negative_total = (orders['delivery_time_days'] < 0).sum()
# print('Number of negative processing times: ', negative_processing)
# print('Number of negative carrier delivery times: ', negative_carrier)
# print('Number of negative customer delivery times: ', negative_customer)
# print('Number of negative total delivery times: ', negative_total)
# Keep only records where every stage duration is non-negative
# (NaN comparisons are False, so undelivered orders with NaT dates are dropped here too)
orders = orders[
    (orders['order_processing_time_days'] >= 0) &
    (orders['carrier_delivery_time_days'] >= 0) &
    (orders['customer_delivery_time_days'] >= 0) &
    (orders['delivery_time_days'] >= 0)
]
# Define the Haversine function for distance calculation
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two points (vectorized).

    Accepts scalars or array-likes of latitudes/longitudes in degrees and
    applies the haversine formula on a sphere of Earth's mean radius.
    """
    earth_radius_km = 6371
    lat1_r, lon1_r, lat2_r, lon2_r = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_dlat = (lat2_r - lat1_r) / 2
    half_dlon = (lon2_r - lon1_r) / 2
    # Haversine of the central angle between the two points
    chord = np.sin(half_dlat) ** 2 + np.cos(lat1_r) * np.cos(lat2_r) * np.sin(half_dlon) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(chord), np.sqrt(1 - chord))
    return earth_radius_km * central_angle
# Calculate straight-line customer-to-seller distances for each order
orders['distance_km'] = haversine(
    orders['c_geolocation_lat'],
    orders['c_geolocation_lng'],
    orders['s_geolocation_lat'],
    orders['s_geolocation_lng']
)
# Additional (derived) efficiency and cost metrics
orders['distance_efficiency'] = orders['distance_km'] / orders['delivery_time_days']  # km covered per delivery day
orders['freight_cost_per_km'] = orders['freight_value'] / orders['distance_km']
# NOTE(review): price_to_distance and order_size_per_km below are the same formula (price / distance_km)
orders['price_to_distance'] = orders['price'] / orders['distance_km']
orders['cost_efficiency'] = orders['freight_value'] / orders['delivery_time_days']  # freight cost per delivery day
orders['order_size_per_km'] = orders['price'] / orders['distance_km']
orders['total_order_cost_per_km'] = orders['total_payment_value'] / orders['distance_km']
orders['freight_share'] = orders['freight_value'] / orders['total_payment_value']  # freight as a share of total paid
# Flag: 1 when delivered after the estimated date, else 0
orders['late_delivery'] = orders['time_to_estimate_delivery_days'].apply(lambda x: 1 if x > 0 else 0)
# Replace infinities (from division by zero distance/payment) and NaNs with 0
# NOTE(review): zero-filling these ratios pulls averages toward 0 — confirm this is intended
numeric_cols = [
    'distance_km', 'distance_efficiency', 'freight_cost_per_km', 'price_to_distance',
    'cost_efficiency', 'order_size_per_km', 'total_order_cost_per_km', 'freight_share'
]
orders[numeric_cols] = orders[numeric_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
# @title Box Plots of Delivery Times
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
# Re-apply global font and style settings (notebook cells may be run standalone)
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})
# Time/distance columns inspected for outliers and used by the IQR removal below
columns_to_check = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days', 'distance_km'
]
# # Set up the figure for multiple subplots
# plt.figure(figsize=(15, 10))
# viridis_colors = plt.cm.viridis(np.linspace(0, 1, len(columns_to_check)))
# # Create individual box plots for each column
# for i, col in enumerate(columns_to_check, 1):
# plt.subplot(3, (len(columns_to_check) + 2) // 3, i)
# sns.boxplot(data=orders_merged[col], orient='v', color=viridis_colors[i-1]) # Use Viridis color
# # Set title for each subplot
# plt.title(col, fontsize=12)
# # Customize spines for each subplot
# ax = plt.gca()
# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)
# ax.spines['left'].set_visible(False)
# # Adjust layout and display the plot
# plt.tight_layout()
# plt.show()
# @title Removing outliers
# Function to remove outliers using IQR method
def remove_outliers_iqr(df, columns):
    """Return a copy of ``df`` with IQR-based outliers removed.

    For each column in ``columns``, rows falling outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive) are dropped.
    Filters are applied sequentially, so each column's quartiles are
    computed on the data remaining after the previous filters.
    """
    filtered = df.copy()
    for column in columns:
        q1, q3 = filtered[column].quantile([0.25, 0.75])
        spread = q3 - q1  # interquartile range
        low, high = q1 - 1.5 * spread, q3 + 1.5 * spread
        # between() is inclusive on both ends, matching the bound definition
        filtered = filtered[filtered[column].between(low, high)]
    return filtered
# Apply the IQR outlier removal to the selected time/distance columns
orders_cleaned = remove_outliers_iqr(orders, columns_to_check)
# # display the cleaned dataframe
# print("Dataset shape after:", orders_cleaned.shape)
import numpy as np
def replace_inf_with_thresholds(df, metrics):
    """Return a copy of ``df`` where infinities in ``metrics`` are capped.

    For each listed column, +inf is replaced by mean + 3*std and -inf by
    mean - 3*std, where the mean and std are computed over that column's
    finite values only. NaN entries are left untouched.
    """
    capped = df.copy()
    for metric in metrics:
        # Statistics over finite values only: treat +/-inf as missing
        finite_vals = capped[metric].replace([np.inf, -np.inf], np.nan)
        center = finite_vals.mean()
        spread = finite_vals.std()
        upper_cap = center + 3 * spread
        lower_cap = center - 3 * spread
        column = capped[metric]
        column = column.mask(column == np.inf, upper_cap)
        column = column.mask(column == -np.inf, lower_cap)
        capped[metric] = column
    return capped
# Ratio metrics whose definitions can produce +/-inf (division by zero distance/payment)
# NOTE(review): infinities in most of these were already zeroed by the earlier
# replace/fillna step, so this pass may be a no-op — verify cell execution order.
# NOTE(review): 'review_score' is not a ratio; presumably included harmlessly — confirm intent.
metrics_to_clean = [
    'freight_cost_per_km', 'price_to_distance', 'cost_efficiency',
    'order_size_per_km', 'total_order_cost_per_km', 'freight_share', 'review_score'
]
# Cap any remaining infinities at mean +/- 3*std per metric
orders_cleaned = replace_inf_with_thresholds(orders_cleaned, metrics_to_clean)
Cities analyzed: Rio de Janeiro, Sao Paulo, Porto Alegre, Curitiba, Fortaleza, Salvador (Bahia), Brasilia
# @title Assigning city labels based on geographic coordinates
# add a city label based on predefined city-center coordinates
import pandas as pd
from geopy.distance import geodesic
# City-center coordinates (lat, lng) used for the 50 km radius matching below
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    #'Fortaleza': (-3.7172, -38.5434),
    'Salvador': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
def label_city(row):
    """Return the first city in cities_coords whose center lies within 50 km
    (geodesic) of this row's customer coordinates, or None if none matches."""
    customer_point = (row['c_geolocation_lat'], row['c_geolocation_lng'])
    matches = (
        city
        for city, center in cities_coords.items()
        if geodesic(customer_point, center).km <= 50  # 50 km radius
    )
    return next(matches, None)
# Label each order with its customer's city (None when outside all 50 km radii)
orders_cleaned['city'] = orders_cleaned.apply(label_city, axis=1)
Exploring delivery times based on customer location is essential for several reasons:
- Logistics Optimization: Delivery times vary due to geographic factors (distance, traffic, infrastructure). By analyzing regional patterns, companies can identify areas where delays occur and optimize routing, adjust warehouse placement, or streamline processes for better efficiency.
- Customer Experience: Understanding location-based delivery performance enables companies to set more accurate delivery expectations, improving customer satisfaction and reducing complaints about late deliveries.
- Cost Management: By pinpointing regions with frequent delays, companies can assess whether the delivery costs are proportionate to customer locations and adjust pricing, warehousing, or transportation strategies.
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Define cities coordinates (latitude, longitude)
# NOTE(review): 'Bahia' is a state name; the coordinates are Salvador's city center — confirm label
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Select the orders whose customer falls within a radius of a given city center
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of the orders_cleaned rows whose customer location lies
    within ``radius_km`` (geodesic distance) of ``city_coords`` (lat, lng)."""
    center_lat, center_lng = city_coords

    def _within_radius(row):
        customer_point = (row['c_geolocation_lat'], row['c_geolocation_lng'])
        return geodesic(customer_point, (center_lat, center_lng)).km <= radius_km

    inside = orders_cleaned.apply(_within_radius, axis=1)
    # .copy() prevents SettingWithCopyWarning on downstream mutation
    return orders_cleaned[inside].copy()
# Collect delivery times across all target cities so every subplot and the
# shared colorbar can use one common color scale.
all_delivery_times = []
for coords in cities_coords.values():
    city_data = filter_city_data(coords)
    all_delivery_times.extend(city_data['delivery_time_days'].dropna())
# Global color-scale bounds shared by all subplots and the colorbar
global_min = min(all_delivery_times)
global_max = max(all_delivery_times)
# Create a 3x3 grid of subplots (sufficient for 7 cities; extras removed below)
fig, axes = plt.subplots(3, 3, figsize=(17, 12))
axes = axes.flatten()
# One scatter map per city: color = delivery time, dot size = payment value
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    if len(city_data) > 0:  # Ensure there's data to plot
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['delivery_time_days'],  # Use delivery time for color
            # BUGFIX: normalize hue to the global range; previously each subplot
            # was normalized independently, so dot colors did not match the
            # shared colorbar drawn below.
            hue_norm=(global_min, global_max),
            palette='viridis_r',  # Inverted Viridis for delivery time (lower is better)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
        ax.set_title(f"Delivery Performance in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        # De-clutter: remove the top, left, and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['right'].set_visible(False)
# Shared horizontal colorbar using the same global normalization as the dots
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Delivery Time (Days)')
# Remove any unused subplots (i is the last plotted index)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Delivery Performance in Major Brazilian Cities, Dot Size by Customer Payment Value', fontsize=16, y=0.98)
plt.show()
---> Higher distance-efficiency values indicate faster deliveries relative to the distance covered.
Analyzing distance efficiency based on customer location is important because:
- Operational Optimization: Distance efficiency reveals how effectively routes are being utilized relative to delivery times. It helps identify inefficiencies in delivery networks and provides insights into how to optimize routes and resources for improved service.
- Geographic Challenges: Distance efficiency can highlight areas with challenging infrastructure or topography that may cause inefficiencies. Understanding these factors helps adjust strategies for specific regions, such as choosing better-suited transportation methods or adding local warehouses.
- Cost Efficiency: Tracking distance efficiency helps identify where excessive fuel or resources may be used due to poor routing or geographic challenges, leading to potential cost reductions in logistics and fuel consumption.
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Define cities coordinates (latitude, longitude)
# NOTE(review): 'Bahia' is a state name; the coordinates are Salvador's city center — confirm label
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Select the orders whose customer falls within a radius of a given city center
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of the orders_cleaned rows whose customer location lies
    within ``radius_km`` (geodesic distance) of ``city_coords`` (lat, lng)."""
    center_lat, center_lng = city_coords

    def _within_radius(row):
        customer_point = (row['c_geolocation_lat'], row['c_geolocation_lng'])
        return geodesic(customer_point, (center_lat, center_lng)).km <= radius_km

    inside = orders_cleaned.apply(_within_radius, axis=1)
    # .copy() prevents SettingWithCopyWarning on downstream mutation
    return orders_cleaned[inside].copy()
# Collect distance-efficiency values across all target cities so every subplot
# and the shared colorbar can use one common color scale.
all_distance_efficiency = []
for coords in cities_coords.values():
    city_data = filter_city_data(coords)
    all_distance_efficiency.extend(city_data['distance_efficiency'].dropna())
# Global color-scale bounds shared by all subplots and the colorbar
global_min = min(all_distance_efficiency)
global_max = max(all_distance_efficiency)
# Create a 3x3 grid of subplots (sufficient for 7 cities; extras removed below)
fig, axes = plt.subplots(3, 3, figsize=(17, 12))
axes = axes.flatten()
# One scatter map per city: color = distance efficiency, dot size = payment value
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    if len(city_data) > 0:  # Ensure there's data to plot
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['distance_efficiency'],  # Use distance efficiency for color
            # BUGFIX: normalize hue to the global range; previously each subplot
            # was normalized independently, so dot colors did not match the
            # shared colorbar drawn below.
            hue_norm=(global_min, global_max),
            palette='viridis',  # Viridis colormap (not inverted; higher is better)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
        ax.set_title(f"Delivery Performance in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        # De-clutter: remove the top, left, and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['right'].set_visible(False)
# Shared horizontal colorbar using the same global normalization as the dots
sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Distance Efficiency')
# Remove any unused subplots (i is the last plotted index)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Delivery Performance in Major Brazilian Cities\nDot Size by Customer Payment Value', fontsize=16, y=0.98)
plt.show()
Analyzing customer review scores based on location provides critical insights into the overall service quality:
- Service Quality by Region: Customer satisfaction can vary across different regions due to factors such as delivery speed, product availability, or local preferences. Identifying areas with lower review scores helps target improvements in those regions.
- Logistics Impact: Geographical issues, such as infrastructure or delivery delays, can negatively impact customer experiences and reviews. Monitoring scores regionally helps identify if logistics challenges are influencing customer satisfaction.
- Targeted Improvements: By understanding the relationship between location and review scores, companies can implement targeted actions, such as improving last-mile delivery services, offering local support, or addressing specific regional complaints to boost customer experience.
# @title Customer Reviews in Major Brazilian Cities (Dot Size by Total Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Define cities coordinates (latitude, longitude)
# NOTE(review): 'Bahia' is a state name; the coordinates are Salvador's city center — confirm label
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Select the orders whose customer falls within a radius of a given city center
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of the orders_cleaned rows whose customer location lies
    within ``radius_km`` (geodesic distance) of ``city_coords`` (lat, lng)."""
    center_lat, center_lng = city_coords

    def _within_radius(row):
        customer_point = (row['c_geolocation_lat'], row['c_geolocation_lng'])
        return geodesic(customer_point, (center_lat, center_lng)).km <= radius_km

    inside = orders_cleaned.apply(_within_radius, axis=1)
    # .copy() prevents SettingWithCopyWarning on downstream mutation
    return orders_cleaned[inside].copy()
# Collect review scores across all target cities so every subplot and the
# shared colorbar can use one common color scale.
all_review_scores = []
for coords in cities_coords.values():
    city_data = filter_city_data(coords)
    all_review_scores.extend(city_data['review_score'].dropna())
# Global color-scale bounds shared by all subplots and the colorbar
global_min = min(all_review_scores)
global_max = max(all_review_scores)
# Create a 3x3 grid of subplots (sufficient for 7 cities; extras removed below)
fig, axes = plt.subplots(3, 3, figsize=(17, 12))
axes = axes.flatten()
# One scatter map per city: color = review score, dot size = payment value
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    if len(city_data) > 0:  # Ensure there's data to plot
        sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['review_score'],  # Use review scores for color
            # BUGFIX: normalize hue to the global range; previously each subplot
            # was normalized independently, so dot colors did not match the
            # shared colorbar drawn below.
            hue_norm=(global_min, global_max),
            palette='viridis',  # Viridis colormap (not inverted; higher is better)
            size=city_data['total_payment_value'],  # Size points by payment value
            sizes=(20, 200),  # Adjust sizes to improve visibility
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
        ax.set_title(f"Customer Review Scores in {city}")
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        # De-clutter: remove the top, left, and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['right'].set_visible(False)
# Shared horizontal colorbar using the same global normalization as the dots
sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Review Score')
# Remove any unused subplots (i is the last plotted index)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Customer Reviews in Major Brazilian Cities, Dot Size by Total Payment Value', fontsize=16, y=0.98)
plt.show()
Analyzing "Time to Estimate Delivery Days" based on customer location is important because:
- Accuracy in Delivery Predictions: Understanding how well delivery estimates align with actual delivery times across different regions allows businesses to improve their prediction models. This ensures customers have more accurate expectations for their orders.
- Regional Variations: Certain cities or regions may consistently show longer or shorter time estimates due to local conditions, infrastructure, or logistical challenges. Identifying these patterns helps adjust estimations to be region-specific.
- Customer Satisfaction: Providing more accurate delivery time estimates leads to higher customer satisfaction. Misalignments between estimated and actual delivery times can result in dissatisfaction, complaints, or negative reviews.
# @title Time To Estimate Delivery Days in Major Brazilian Cities (Dot Size by Total Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Define cities coordinates (latitude, longitude)
# NOTE(review): 'Bahia' is a state name; the coordinates are Salvador's city center — confirm label
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Haversine distance function to filter data for a specific city within a radius
def filter_city_data(city_coords, radius_km=50):
lat, lng = city_coords
city_data = orders_cleaned[
orders_cleaned.apply(
lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
axis=1
)
].copy() # Use .copy() to avoid SettingWithCopyWarning
return city_data
# Calculate global min and max time to estimate delivery days across all cities for consistent color mapping
all_time_to_estimate = []
for coords in cities_coords.values():
city_data = filter_city_data(coords)
all_time_to_estimate.extend(city_data['time_to_estimate_delivery_days'].dropna())
# Get the global min and max for color mapping
global_min = min(all_time_to_estimate)
global_max = max(all_time_to_estimate)
# Create a 3x3 grid of subplots (sufficient for 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(17, 12))
axes = axes.flatten()
# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
ax = axes[i] # Get subplot grid position
city_data = filter_city_data(coords)
# Use all data for the city
if len(city_data) > 0: # Ensure there's data to plot
# Create a scatterplot showing time to estimate delivery days with inverted Viridis colormap
scatter = sns.scatterplot(
x=city_data['c_geolocation_lng'],
y=city_data['c_geolocation_lat'],
hue=city_data['time_to_estimate_delivery_days'], # Use delivery time for color
palette='viridis_r', # Inverted Viridis colormap (lower times are better)
size=city_data['total_payment_value'], # Size points by payment value
sizes=(20, 200), # Adjust sizes to improve visibility
ax=ax,
legend=False # Disable legend for individual plots
)
ax.set_title(f"Time To Estimate Delivery Days in {city}")
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
# Remove the top, left, and right spines
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
# Create a colorbar based on time to estimate delivery days
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(global_min, global_max))
sm.set_array([]) # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Time To Estimate Delivery Days')
# Remove any unused subplots
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4) # Adjust space between subplots
plt.suptitle('Time To Estimate Delivery Days in Major Brazilian Cities, Dot Size by Total Payment Value', fontsize=16, y=0.98)
plt.show()
Plotting delivery time days based on seller location is essential for understanding the geographical impact of delivery performance:
- Seller-Based Logistics Insights: By visualizing delivery times relative to the seller's location, companies can assess the efficiency of their sellers. This enables identification of specific sellers or regions with longer delivery times, allowing for targeted improvements in supply chain or logistics management.
- Operational Bottlenecks: Regions where sellers consistently have higher delivery times may indicate logistical bottlenecks, such as poor infrastructure or longer routes. Addressing these issues can reduce delivery delays and improve customer satisfaction.
- Seller Performance Monitoring: Sellers with higher volumes of orders may have different delivery time patterns compared to those handling fewer orders. Monitoring the relationship between delivery times and the number of orders allows businesses to support high-volume sellers with better logistical planning and resources.
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Aggregate per-seller metrics from the cleaned orders table.
sellers_grouped = orders_cleaned.groupby(['seller_id', 's_geolocation_lat', 's_geolocation_lng']).agg(
    num_orders=('order_id', 'count'),                  # Total number of orders per seller
    avg_freight_value=('freight_value', 'mean'),       # Average freight value
    avg_order_value=('total_payment_value', 'mean'),   # Average order value
    avg_delivery_time=('delivery_time_days', 'mean'),  # Average delivery time in days
    time_to_estimate_delivery_days=('time_to_estimate_delivery_days', 'mean')  # Average estimated delivery time
).reset_index()

# Alias kept for easier plotting (other cells use this column name too)
sellers_grouped['delivery_time_days'] = sellers_grouped['avg_delivery_time']

# Ensure valid coordinates: coerce to numeric and drop unparseable lat/lng rows
sellers_grouped['s_geolocation_lat'] = pd.to_numeric(sellers_grouped['s_geolocation_lat'], errors='coerce')
sellers_grouped['s_geolocation_lng'] = pd.to_numeric(sellers_grouped['s_geolocation_lng'], errors='coerce')
sellers_grouped.dropna(subset=['s_geolocation_lat', 's_geolocation_lng'], inplace=True)

def filter_city_data(city_coords, radius_km=50):
    """Return sellers located within `radius_km` km of `city_coords` (geodesic distance)."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # .copy() avoids SettingWithCopyWarning
    return city_data

# Shared colour normalisation: used by BOTH the scatter hues and the colorbar.
norm = plt.Normalize(vmin=sellers_grouped['delivery_time_days'].min(),
                     vmax=sellers_grouped['delivery_time_days'].max())

# Create a 3x3 grid of subplots (sufficient for 6 cities)
fig, axes = plt.subplots(3, 3, figsize=(14, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through cities and plot
# (NOTE: an unused per-city 'delivery_delay' column was computed here before;
# it never fed any plot, so it has been removed.)
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]
    city_data = filter_city_data(coords)
    sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['delivery_time_days'],  # Colour by average delivery time
        hue_norm=norm,  # BUGFIX: without this each subplot auto-scaled its hues, so dot colours did not match the shared colorbar
        palette='viridis_r',  # Inverted Viridis (lower delivery times are better)
        size=city_data['num_orders'],  # Dot size by number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Delivery Performance in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar built from the SAME normalisation as the scatters.
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Delivery Time Days')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Delivery Performance in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Plotting distance efficiency based on seller location provides key insights into how well sellers are optimizing their deliveries over various distances:
- Logistical Optimization: This visualization helps identify which sellers or regions are most efficient in managing the distance between their location and customers. Sellers with high distance efficiency are likely optimizing their logistics and transportation.
- Regional Efficiency Patterns: By comparing cities, we can determine if certain cities consistently have more efficient sellers, possibly due to better infrastructure, transport networks, or delivery hubs.
- High Volume vs. Efficiency Trade-off: By correlating the number of orders with distance efficiency, we can assess if high-volume sellers are sacrificing efficiency for order fulfillment, or if they manage to maintain both high order volumes and efficient delivery practices.
# @title Distance Efficiency in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Aggregate per-seller distance-efficiency metrics.
sellers_grouped = orders_cleaned.groupby(['seller_id', 's_geolocation_lat', 's_geolocation_lng']).agg(
    num_orders=('order_id', 'count'),                            # Total number of orders per seller
    avg_distance_efficiency=('distance_efficiency', 'mean'),     # Average distance efficiency
    avg_delivery_time=('delivery_time_days', 'mean')             # Average delivery time in days
).reset_index()

def filter_city_data(city_coords, radius_km=50):
    """Return sellers located within `radius_km` km of `city_coords` (geodesic distance)."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # .copy() avoids SettingWithCopyWarning
    return city_data

# Shared colour normalisation: used by BOTH the scatter hues and the colorbar.
norm = plt.Normalize(vmin=sellers_grouped['avg_distance_efficiency'].min(),
                     vmax=sellers_grouped['avg_distance_efficiency'].max())

# Create a 3x3 grid of subplots (sufficient for 6 cities)
fig, axes = plt.subplots(3, 3, figsize=(14, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]
    city_data = filter_city_data(coords)
    sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['avg_distance_efficiency'],  # Colour by average distance efficiency
        hue_norm=norm,  # BUGFIX: without this each subplot auto-scaled its hues, so dot colours did not match the shared colorbar
        palette='viridis',  # Viridis colormap (not inverted: higher efficiency is better)
        size=city_data['num_orders'],  # Dot size by number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Distance Efficiency in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar built from the SAME normalisation as the scatters.
sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Average Distance Efficiency')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Distance Efficiency in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Analyzing Freight Ratio to Understand Transportation Costs
The freight ratio is calculated by dividing the freight value by the order value, indicating the percentage of the product price that customers pay for delivery. For instance, if a product costs R$50.00 and the freight value is R$10.00, the freight ratio would be 0.2, or 20% (prices in the Olist dataset are in Brazilian reais). A higher freight ratio may discourage customers from completing their purchases.
Logistics costs typically result in varying freight ratios based on population density. Densely populated areas generally exhibit lower freight ratios due to optimized logistics, while sparsely populated regions are likely to have higher freight ratios. Understanding these patterns can provide valuable insights into customer behavior and logistics efficiency.
Plotting freight share based on seller locations is essential for understanding the cost dynamics of logistics in various regions:
- Cost Management: This visualization reveals how much of the total order cost is attributed to freight expenses for different sellers. It allows businesses to identify areas where costs can be optimized, leading to improved profitability.
- Performance Benchmarking: By comparing freight shares across cities and sellers, organizations can benchmark performance. High freight share in certain locations might indicate inefficiencies or higher logistical challenges that need to be addressed.
- Strategic Decision-Making: Understanding freight share helps in making informed decisions about supplier selection, pricing strategies, and resource allocation. It can guide companies on where to invest in infrastructure improvements or adjust pricing models to better compete.
- Customer Experience: High freight costs can impact overall pricing and delivery times. By analyzing freight share, businesses can assess how logistics affect customer satisfaction and make necessary adjustments to meet customer expectations effectively.
# @title Freight Share in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Aggregate per-seller freight metrics. The mean freight share is named
# 'freight_share' directly (the original aggregated it as 'avg_freight_share'
# and then renamed it — one step instead of two).
sellers_grouped = orders_cleaned.groupby(['seller_id', 's_geolocation_lat', 's_geolocation_lng']).agg(
    num_orders=('order_id', 'count'),                  # Total number of orders per seller
    freight_share=('freight_share', 'mean'),           # Average freight share
    avg_delivery_time=('delivery_time_days', 'mean'),  # Average delivery time in days
    total_freight=('freight_value', 'sum')             # Total freight value handled by the seller
).reset_index()

def filter_city_data(city_coords, radius_km=50):
    """Return sellers located within `radius_km` km of `city_coords` (geodesic distance)."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # .copy() avoids SettingWithCopyWarning
    return city_data

# Shared colour normalisation: used by BOTH the scatter hues and the colorbar.
norm = plt.Normalize(vmin=sellers_grouped['freight_share'].min(),
                     vmax=sellers_grouped['freight_share'].max())

# Create a 3x3 grid of subplots (sufficient for 6 cities)
fig, axes = plt.subplots(3, 3, figsize=(14, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]
    city_data = filter_city_data(coords)
    sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['freight_share'],  # Colour by freight share
        hue_norm=norm,  # BUGFIX: without this each subplot auto-scaled its hues, so dot colours did not match the shared colorbar
        palette='viridis_r',  # Inverted Viridis (higher freight share is less favourable)
        size=city_data['num_orders'],  # Dot size by number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Freight Share in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar built from the SAME normalisation as the scatters.
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Freight Share')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Freight Share in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Plotting the "time to estimate delivery days" based on seller location and order size is important for several reasons:
- Identifying Regional Delivery Performance: By visualizing delivery times geographically, businesses can easily identify areas where sellers may face longer delivery delays, enabling targeted interventions to improve logistics and customer service in those regions.
- Assessing Impact of Order Volume: The size of the points on the plot indicates the number of orders each seller has filled. This allows businesses to analyze whether high-order-volume sellers are able to maintain timely deliveries, highlighting potential capacity issues that could impact customer satisfaction.
- Strategic Resource Allocation: Understanding the relationship between seller location, delivery times, and order size enables businesses to make informed decisions on resource allocation, such as optimizing logistics support or enhancing seller performance in specific areas, ultimately driving operational efficiency and customer satisfaction.
# @title Time to Estimate Delivery Days in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic

# Group sellers by location and compute the mean estimated-delivery window and order count.
sellers_grouped = orders_cleaned.groupby(['seller_id', 's_geolocation_lat', 's_geolocation_lng']).agg(
    time_to_estimate_delivery_days=('time_to_estimate_delivery_days', 'mean'),
    num_orders=('order_id', 'count')  # Count number of orders per seller
).reset_index()

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return sellers located within `radius_km` km of `city_coords` (geodesic distance)."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # .copy() avoids SettingWithCopyWarning
    return city_data

# Shared colour normalisation: used by BOTH the scatter hues and the colorbar.
norm = plt.Normalize(vmin=sellers_grouped['time_to_estimate_delivery_days'].min(),
                     vmax=sellers_grouped['time_to_estimate_delivery_days'].max())

# Create a 3x3 grid of subplots (sufficient for 6 cities)
fig, axes = plt.subplots(3, 3, figsize=(14, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]
    city_data = filter_city_data(coords)
    sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['time_to_estimate_delivery_days'],  # Colour by estimated-delivery window
        hue_norm=norm,  # BUGFIX: without this each subplot auto-scaled its hues, so dot colours did not match the shared colorbar
        palette='viridis_r',  # Inverted Viridis (lower times are better)
        size=city_data['num_orders'],  # Dot size by number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Time to Estimate Delivery Days in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar built from the SAME normalisation as the scatters.
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Time to Estimate Delivery Days')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Time to Estimate Delivery Days in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Plotting the total order cost per km based on seller location and order size is important for several reasons:
- Identifying High-Cost Areas: By visualizing total order costs geographically, businesses can pinpoint regions where shipping costs are disproportionately high. This information is critical for optimizing delivery routes and reducing transportation expenses in those specific areas.
- Assessing Seller Performance: The size of the points in the plot indicates the number of orders each seller has filled. Analyzing how total order cost per km correlates with order volume allows businesses to evaluate whether high-volume sellers are maintaining cost efficiency. This can inform decisions about supporting sellers that may be struggling with high logistics costs.
- Strategic Pricing Adjustments: Understanding the geographic distribution of order costs helps businesses make targeted pricing decisions. For instance, sellers operating in high-cost areas may need to adjust their pricing models to remain competitive, while also considering promotional strategies to attract customers in those regions.
# @title Total Order Cost per km in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic

# Group sellers by geolocation: mean total order cost per km and order count.
sellers_grouped = orders_cleaned.groupby(['seller_id', 's_geolocation_lat', 's_geolocation_lng']).agg(
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),  # Average total order cost per km
    num_orders=('order_id', 'count')  # Count number of orders per seller
).reset_index()

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def filter_city_data(city_coords, radius_km=50):
    """Return sellers located within `radius_km` km of `city_coords` (geodesic distance)."""
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # .copy() avoids SettingWithCopyWarning
    return city_data

# Shared colour normalisation: used by BOTH the scatter hues and the colorbar.
norm = plt.Normalize(vmin=sellers_grouped['total_order_cost_per_km'].min(),
                     vmax=sellers_grouped['total_order_cost_per_km'].max())

# Create a 3x3 grid of subplots (sufficient for 6 cities)
fig, axes = plt.subplots(3, 3, figsize=(14, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]
    city_data = filter_city_data(coords)
    sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['total_order_cost_per_km'],  # Colour by cost per km
        hue_norm=norm,  # BUGFIX: without this each subplot auto-scaled its hues, so dot colours did not match the shared colorbar
        palette='viridis_r',  # Inverted Viridis (lower cost per km is better)
        size=city_data['num_orders'],  # Dot size by number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Total Order Cost per km in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar built from the SAME normalisation as the scatters.
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Total Order Cost per km')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Total Order Cost per km in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
- Focused Analysis on Key Contributors: The 10 largest sellers account for a significant share of transactions, making them critical to overall performance. By isolating these sellers, businesses can better understand the trends and challenges associated with their most impactful players.
- Targeted Performance Insights: Analyzing delivery performance metrics, such as delivery times, for top sellers helps pinpoint operational inefficiencies. Identifying delays or inconsistencies specific to these sellers allows for focused interventions, leading to more effective logistics strategies.
- Regional Impact and Coverage: By plotting customer delivery points and seller locations, businesses can evaluate the geographic spread and performance consistency of their largest sellers. This insight supports decisions regarding logistics improvements, warehouse placements, or regional resource allocation.
- Data-Driven Decision-Making: Comparing delivery times and order sizes across these top sellers facilitates strategic planning. Businesses can recognize top-performing sellers and replicate their best practices while identifying underperforming ones that require support or revised agreements.
- Customer Experience Optimization: Visualizing seller-customer interactions for major contributors provides insights into how well these sellers are meeting customer expectations. Enhancing their delivery performance can significantly impact overall customer satisfaction and retention.
# @title Delivery Performance of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# The 10 largest sellers by number of order rows
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# Create a 3x4 grid of subplots (sufficient for 10 largest sellers)
fig, axes = plt.subplots(3, 4, figsize=(17, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Global min/max delivery time so every subplot and the colorbar share one scale
global_vmin = orders_cleaned['delivery_time_days'].min()
global_vmax = orders_cleaned['delivery_time_days'].max()

# Hoisted out of the loop: these are loop-invariant (the original rebuilt the
# colormap, the Normalize object, and the titles list on every iteration).
cmap = colormaps['viridis_r']  # Inverted Viridis (lower delivery times are better)
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

# Loop through the 10 largest sellers and plot
for i, seller_id in enumerate(largest_seller_ids):
    # Filter data for the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations coloured by delivery time, sized by payment value
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['delivery_time_days'],  # Use delivery_time_days as colour metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # Size based on total_payment_value
            sizes=(20, 200),  # Min/max size range of customer dots
            ax=axes[i],
            legend=False,  # No legend for individual plots
            hue_norm=norm  # Normalize hue for consistent mapping across subplots
        )
        # Mark the seller's (mean) location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )
        # Subtitle by seller ranking
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)

# Global colorbar for delivery time, built from the same cmap/norm as the scatters
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Delivery Time (days)', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Delete any unused subplots (if there are fewer than 12)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Delivery Performance of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
plt.show()
# @title Time to Estimate Delivery Days of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Identify the 10 sellers with the most orders.
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 3x4 grid: room for 10 seller maps plus spare cells (removed below).
fig, axes = plt.subplots(3, 4, figsize=(17, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Global min/max of the metric so the hue scale is comparable across subplots.
global_vmin = orders_cleaned['time_to_estimate_delivery_days'].min()
global_vmax = orders_cleaned['time_to_estimate_delivery_days'].max()

# Loop invariants hoisted out of the per-seller loop: one colormap,
# one normalizer, and one ranking-title list are enough for all subplots.
cmap = colormaps['viridis_r']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

for i, seller_id in enumerate(largest_seller_ids):
    # Orders handled by the current seller.
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by time-to-estimate and sized by payment value.
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['time_to_estimate_delivery_days'],  # color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # dot size by payment value
            sizes=(20, 200),  # min/max customer dot size
            ax=axes[i],
            legend=False,   # a single shared colorbar is added after the loop
            hue_norm=norm   # shared normalization across subplots
        )
        # Seller's mean location as a black reference dot.
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # No orders for this seller: hide its subplot.
        axes[i].set_visible(False)

# One shared horizontal colorbar for all subplots (same norm/cmap as above).
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Time to Estimate Delivery Days', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Remove grid cells beyond the number of sellers plotted.
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Time to Estimate Delivery Days of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
plt.show()
# @title Customer Freight Share for 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Identify the 10 sellers with the most orders.
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 3x4 grid: room for 10 seller maps plus spare cells (removed below).
fig, axes = plt.subplots(3, 4, figsize=(17, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Global min/max of freight share so the hue scale is comparable across subplots.
global_vmin = orders_cleaned['freight_share'].min()
global_vmax = orders_cleaned['freight_share'].max()

# Loop invariants hoisted out of the per-seller loop: one colormap,
# one normalizer, and one ranking-title list are enough for all subplots.
cmap = colormaps['viridis_r']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

for i, seller_id in enumerate(largest_seller_ids):
    # Orders handled by the current seller.
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by freight share and sized by payment value.
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['freight_share'],  # color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # dot size by payment value
            sizes=(20, 200),  # min/max customer dot size
            ax=axes[i],
            legend=False,   # a single shared colorbar is added after the loop
            hue_norm=norm   # shared normalization across subplots
        )
        # Seller's mean location as a black reference dot.
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # No orders for this seller: hide its subplot.
        axes[i].set_visible(False)

# One shared horizontal colorbar for all subplots (same norm/cmap as above).
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Freight Share', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Remove grid cells beyond the number of sellers plotted.
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Customer Freight Share for 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
plt.show()
# @title Cost Efficiency of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Identify the 10 sellers with the most orders.
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 3x4 grid: room for 10 seller maps plus spare cells (removed below).
fig, axes = plt.subplots(3, 4, figsize=(17, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Global min/max of cost efficiency so the hue scale is comparable across subplots.
global_vmin = orders_cleaned['cost_efficiency'].min()
global_vmax = orders_cleaned['cost_efficiency'].max()

# Loop invariants hoisted out of the per-seller loop: one colormap,
# one normalizer, and one ranking-title list are enough for all subplots.
cmap = colormaps['viridis']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

for i, seller_id in enumerate(largest_seller_ids):
    # Orders handled by the current seller.
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by cost efficiency and sized by payment value.
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['cost_efficiency'],  # color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # dot size by payment value
            sizes=(20, 200),  # min/max customer dot size
            ax=axes[i],
            legend=False,   # a single shared colorbar is added after the loop
            hue_norm=norm   # shared normalization across subplots
        )
        # Seller's mean location as a black reference dot.
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # No orders for this seller: hide its subplot.
        axes[i].set_visible(False)

# One shared horizontal colorbar for all subplots (same norm/cmap as above).
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Cost Efficiency', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Remove grid cells beyond the number of sellers plotted.
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Cost Efficiency of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
plt.show()
# @title Customer Review Score of 10 Largest Sellers (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Identify the 10 sellers with the most orders.
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()

# 3x4 grid: room for 10 seller maps plus spare cells (removed below).
fig, axes = plt.subplots(3, 4, figsize=(17, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Global min/max of review score so the hue scale is comparable across subplots.
global_vmin = orders_cleaned['review_score'].min()
global_vmax = orders_cleaned['review_score'].max()

# Loop invariants hoisted out of the per-seller loop: one colormap,
# one normalizer, and one ranking-title list are enough for all subplots.
cmap = colormaps['viridis']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]

for i, seller_id in enumerate(largest_seller_ids):
    # Orders handled by the current seller.
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by review score and sized by payment value.
        sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['review_score'],  # color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # dot size by payment value
            sizes=(20, 200),  # min/max customer dot size
            ax=axes[i],
            legend=False,   # a single shared colorbar is added after the loop
            hue_norm=norm   # shared normalization across subplots
        )
        # Seller's mean location as a black reference dot.
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # No orders for this seller: hide its subplot.
        axes[i].set_visible(False)

# One shared horizontal colorbar for all subplots (same norm/cmap as above).
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Review Score', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Remove grid cells beyond the number of sellers plotted.
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Customer Review Score of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
plt.show()
Visualizing performance metrics across product categories and customer locations provides valuable insights for several reasons:
- Understanding Performance Trends by Category: Plotting metrics such as delivery times, cost efficiency, or freight share for different product categories helps businesses identify trends and outliers. This enables targeted improvements in logistics, pricing, or operational strategies to enhance overall performance in specific categories.
- Enhancing Customer Satisfaction: By analyzing metrics tied to customer locations, businesses can better understand regional variations in service quality. This information allows for proactive communication with customers, adjustments in marketing strategies, and improved alignment of services to customer expectations in different regions.
- Informed Decision-Making: Comparing metrics across categories and locations enables data-driven decisions about inventory management, supplier relationships, and logistics partnerships. Persistent challenges in certain categories or regions can prompt businesses to explore alternative solutions to optimize their operations.
- Identifying Regional Opportunities: Mapping metrics geographically reveals areas with high potential for growth or improvement. Understanding regional performance variations can guide resource allocation, pricing adjustments, and tailored strategies to meet customer demands effectively.
# @title Delivery Delay (in days) by Product Category (10 Largest Categories)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories.
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}

# Identify the 10 product categories with the most orders.
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: room for 10 category maps plus spare cells (removed below).
fig, axes = plt.subplots(4, 3, figsize=(14, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Reversed viridis (loop invariant, hoisted): higher values render darker,
# lower values lighter/yellow.
cmap = colormaps['viridis_r']

for i, category in enumerate(largest_categories):
    # Orders for the current product category.
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    if not category_data.empty:
        # Metric colored on this subplot.
        metric = category_data['time_to_estimate_delivery_days']
        # Per-category color range; widen a degenerate (single-value) range
        # so Normalize stays well-defined.
        vmin = metric.min()
        vmax = metric.max()
        if vmin == vmax:
            vmax = vmin + 1
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Customer locations colored by delivery delay.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,
            palette=cmap,
            sizes=(20, 200),  # min/max dot size range
            ax=axes[i],
            legend=False,   # per-subplot colorbar added below instead
            hue_norm=norm
        )
        # Subtitle with the English category name (fallback to the raw name).
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar: ranges differ between categories.
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery Delay (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)
        # Integer ticks at roughly five intervals across the range.
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No orders for this category: hide its subplot.
        axes[i].set_visible(False)

# Remove grid cells beyond the number of categories plotted.
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave headroom for the suptitle
plt.suptitle('Delivery Delay (in days) by Product Category', fontsize=16)
plt.show()
# @title Delivery Delay (in days) by Product Category (10 Largest Categories with Late Deliveries Only)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories.
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}

# Identify the 10 product categories with the most orders.
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# Keep only late deliveries (positive delay relative to the estimate).
late_deliveries = orders_cleaned[orders_cleaned['time_to_estimate_delivery_days'] > 0]

# 4x3 grid: room for 10 category maps plus spare cells (removed below).
fig, axes = plt.subplots(4, 3, figsize=(14, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Reversed viridis (loop invariant, hoisted): higher values render darker,
# lower values lighter/yellow.
cmap = colormaps['viridis_r']

for i, category in enumerate(largest_categories):
    # Late orders for the current product category.
    category_data = late_deliveries[late_deliveries['product_category_name'] == category].copy()
    if not category_data.empty:
        # Metric colored on this subplot.
        metric = category_data['time_to_estimate_delivery_days']
        # Per-category color range; widen a degenerate (single-value) range
        # so Normalize stays well-defined.
        vmin = metric.min()
        vmax = metric.max()
        if vmin == vmax:
            vmax = vmin + 1
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Customer locations colored by delivery delay.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,
            palette=cmap,
            sizes=(20, 200),  # min/max dot size range
            ax=axes[i],
            legend=False,   # per-subplot colorbar added below instead
            hue_norm=norm
        )
        # Subtitle with the English category name (fallback to the raw name).
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar: ranges differ between categories.
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery delay (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)
        # Integer ticks at roughly five intervals across the range.
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No late orders for this category: hide its subplot.
        axes[i].set_visible(False)

# Remove grid cells beyond the number of categories plotted.
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave headroom for the suptitle
plt.suptitle('Delivery Delay (in days) by Product Category (Late Deliveries Only)', fontsize=16)
plt.show()
# @title Delivery Time (in days) by Product Category (10 Largest Categories)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories.
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}

# Identify the 10 product categories with the most orders.
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: room for 10 category maps plus spare cells (removed below).
fig, axes = plt.subplots(4, 3, figsize=(14, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Reversed viridis (loop invariant, hoisted): longer delivery times render
# darker, shorter ones lighter/yellow.
cmap = colormaps['viridis_r']

for i, category in enumerate(largest_categories):
    # Orders for the current product category.
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    if not category_data.empty:
        # Metric colored on this subplot.
        metric = category_data['delivery_time_days']
        # Per-category color range; widen a degenerate (single-value) range
        # so Normalize stays well-defined.
        vmin = metric.min()
        vmax = metric.max()
        if vmin == vmax:
            vmax = vmin + 1
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Customer locations colored by delivery time.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,
            palette=cmap,
            sizes=(20, 200),  # min/max dot size range
            ax=axes[i],
            legend=False,   # per-subplot colorbar added below instead
            hue_norm=norm
        )
        # Subtitle with the English category name (fallback to the raw name).
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar: ranges differ between categories.
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery time (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)
        # Integer ticks at roughly five intervals across the range.
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No orders for this category: hide its subplot.
        axes[i].set_visible(False)

# Remove grid cells beyond the number of categories plotted.
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave headroom for the suptitle
plt.suptitle('Delivery Time (in days) by Product Category', fontsize=16)
plt.show()
Visualizing delivery performance across product categories based on seller locations is valuable for several reasons:
- Understanding Regional Challenges: By mapping seller locations and their performance, businesses can identify regions where logistics challenges may be affecting delivery outcomes.
- Optimizing Seller Networks: This analysis highlights the sellers filling the most orders and their geographic distribution, providing opportunities to optimize logistics support for high-performing sellers.
- Improving Resource Allocation: Insights from this visualization allow businesses to allocate resources more effectively, focusing on regions or sellers that require additional support to meet performance standards.
- Enhancing Category-Specific Strategies: Linking delivery performance to product categories enables businesses to refine category-specific logistics strategies, improving efficiency and customer satisfaction for targeted products.
# @title Delivery Time (days) by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps

# Portuguese -> English display names for product categories.
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Identify the 10 product categories with the most orders.
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()

# 4x3 grid: room for 10 category maps plus spare cells (removed below).
fig, axes = plt.subplots(4, 3, figsize=(14, 12))
axes = axes.flatten()  # 1-D indexing of the grid

# Reversed viridis (loop invariant, hoisted): longer delivery times render
# darker, shorter ones lighter/yellow.
cmap = colormaps['viridis_r']

for i, category in enumerate(largest_categories):
    # Orders for the current product category.
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # One row per seller: mean location, mean delivery time, order count.
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',
        's_geolocation_lat': 'mean',
        'delivery_time_days': 'mean',
        'order_id': 'count'
    }).reset_index()
    if not seller_data.empty:
        # Per-category color range; widen a degenerate (single-value) range
        # so Normalize stays well-defined (fix: this guard existed in the
        # sibling cells but was missing here).
        vmin = seller_data['delivery_time_days'].min()
        vmax = seller_data['delivery_time_days'].max()
        if vmin == vmax:
            vmax = vmin + 1
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Seller locations colored by mean delivery time, sized by order count.
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=seller_data['delivery_time_days'],  # color metric
            size=seller_data['order_id'],           # dot size by orders filled
            sizes=(50, 400),  # min/max seller dot size
            ax=axes[i],
            legend=False,   # per-subplot colorbar added below instead
            hue_norm=norm,
            palette=cmap
        )
        # Subtitle with the English category name (fallback to the raw name).
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Drop top/left/right spines for a cleaner look.
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar: ranges differ between categories.
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery Time (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)
        # Integer ticks at roughly five intervals across the range.
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No sellers for this category: hide its subplot.
        axes[i].set_visible(False)

# Remove grid cells beyond the number of categories plotted.
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])

# Layout and main title.
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave headroom for the suptitle
plt.suptitle('Delivery Time (days) by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
plt.show()
# @title Delivery to Estimate (days) by Product Category (Seller Locations, Dot Size by Orders Filled)
# Cell purpose: for each of the 10 largest product categories, draw a scatter
# of mean seller locations, colouring each seller by its mean
# 'time_to_estimate_delivery_days' and sizing each dot by orders filled.
# NOTE(review): relies on the notebook-global 'orders_cleaned' DataFrame built
# in earlier cells; assumes it carries 's_geolocation_lng'/'s_geolocation_lat'
# and 'time_to_estimate_delivery_days' columns — confirm if cells are reordered.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# Get the 10 largest product categories by row count
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# Create a 4x3 grid of subplots (sufficient for 10 largest categories, with some empty spaces)
fig, axes = plt.subplots(4, 3, figsize=(14, 12))  # 4 rows and 3 columns for 10 plots
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Loop through the 10 largest categories and plot one subplot per category
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # Aggregate seller information by seller_id (one dot per seller)
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'time_to_estimate_delivery_days': 'mean',  # Mean delivery time vs. estimate
        'order_id': 'count'  # Count of orders fulfilled by the seller
    }).reset_index()
    # Check if there's data for the category
    if not seller_data.empty:
        # Inverted viridis so that shorter (better) times get the darker shades
        cmap = colormaps['viridis_r']
        # Normalize per category, so each subplot uses its own colour range
        vmin = seller_data['time_to_estimate_delivery_days'].min()
        vmax = seller_data['time_to_estimate_delivery_days'].max()
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Create a scatterplot of seller locations coloured by delivery time
        scatter = sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=seller_data['time_to_estimate_delivery_days'],  # Delivery time as color metric
            size=seller_data['order_id'],  # Size dots by order count
            sizes=(50, 400),  # Size range for order count
            ax=axes[i],  # Corresponding subplot
            legend=False,  # No individual legend (shared colorbar instead)
            hue_norm=norm,  # Normalize hue for consistency
            palette=cmap  # Inverted viridis colormap
        )
        # Translate the category name to English (fallback to original if not found)
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        # Set labels for the axes
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Remove the top, left, and right spines for clarity
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Add a color bar for delivery times; a bare ScalarMappable is enough
        # because the norm/cmap pair fully defines the mapping
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery to Estimate (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Adjust tick label size
        # Integer ticks; float floor-division makes the step collapse to 1
        # whenever the category's range is below 5 days
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Delete any unused subplots (grid has 12 cells but only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
# Set main title for the entire figure
plt.suptitle('Delivery to Estimate (days) by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
# Show the plot
plt.show()
# @title Cost Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)
# Cell purpose: per top-10 category, scatter mean seller locations coloured by
# mean 'cost_efficiency', dot size = orders filled by that seller.
# Relies on the notebook-global 'orders_cleaned' DataFrame from earlier cells.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# Get the 10 largest product categories
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# Create a 4x3 grid of subplots (sufficient for 10 largest categories, with some empty spaces)
fig, axes = plt.subplots(4, 3, figsize=(14, 12))  # 4 rows and 3 columns for 10 plots
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # Aggregate seller information by seller_id (one dot per seller)
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'cost_efficiency': 'mean',  # Mean cost efficiency
        'order_id': 'count'  # Count of orders fulfilled by the seller
    }).reset_index()
    # Check if there's data for the category
    if not seller_data.empty:
        # Get the cost efficiency metric
        metric = seller_data['cost_efficiency']
        # Use 'viridis' colormap
        cmap = colormaps['viridis']
        # Normalize per category so each subplot uses its own colour range
        vmin, vmax = metric.min(), metric.max()
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Create a scatterplot of seller locations based on cost efficiency
        scatter = sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,  # Use cost efficiency as color metric
            size=seller_data['order_id'],  # Dot size based on order count
            sizes=(50, 400),  # Size range for dots
            ax=axes[i],  # Corresponding subplot
            legend=False,  # No individual legend (shared colorbar instead)
            hue_norm=norm,  # Normalize hue for consistent mapping
            palette=cmap  # Viridis colormap
        )
        # Translate the category name to English (fallback to original if not found)
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        # Set axis labels
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Remove top, left, and right spines for clarity
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Add a color bar for cost efficiency
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Cost Efficiency', fontsize=12)
        cbar.ax.tick_params(labelsize=10)
        # FIX: the previous np.linspace(..., dtype=int) truncated tick positions
        # to integers, which produced duplicate (often all-zero) ticks whenever
        # the metric range was fractional or narrower than ~5. Keep float tick
        # positions and pick a label format adapted to the range width.
        tick_positions = np.linspace(vmin, vmax, num=5)
        cbar.set_ticks(tick_positions)
        if (vmax - vmin) >= 5:
            cbar.ax.set_yticklabels([f'{tick:.0f}' for tick in tick_positions])
        else:
            cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])
    else:
        # Hide empty subplot
        axes[i].set_visible(False)
# Delete any unused subplots (grid has 12 cells but only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
# Set main title for the figure
plt.suptitle('Cost Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
# Show the plot
plt.show()
# @title Distance Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)
# Cell purpose: per top-10 category, scatter mean seller locations coloured by
# mean 'distance_efficiency', dot size = orders filled by that seller.
# Relies on the notebook-global 'orders_cleaned' DataFrame from earlier cells.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# Get the 10 largest product categories
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# Create a 4x3 grid of subplots (sufficient for 10 largest categories, with some empty spaces)
fig, axes = plt.subplots(4, 3, figsize=(14, 12))  # 4 rows and 3 columns for 10 plots
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # Aggregate seller information by seller_id (one dot per seller)
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'distance_efficiency': 'mean',  # Mean distance efficiency
        'order_id': 'count'  # Count of orders fulfilled by the seller
    }).reset_index()
    # Check if there's data for the category
    if not seller_data.empty:
        # Get the distance efficiency metric
        metric = seller_data['distance_efficiency']
        # Use 'viridis' colormap
        cmap = colormaps['viridis']
        # Normalize per category so each subplot uses its own colour range
        vmin, vmax = metric.min(), metric.max()
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Create a scatterplot of seller locations based on distance efficiency
        scatter = sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,  # Use distance efficiency as color metric
            size=seller_data['order_id'],  # Dot size based on order count
            sizes=(50, 400),  # Size range for dots
            ax=axes[i],  # Corresponding subplot
            legend=False,  # No individual legend (shared colorbar instead)
            hue_norm=norm,  # Normalize hue for consistent mapping
            palette=cmap  # Viridis colormap
        )
        # Translate the category name to English (fallback to original if not found)
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        # Set axis labels
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Remove top, left, and right spines — added for consistency with the
        # other per-category scatter cells, which all apply this styling
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Add a color bar for distance efficiency
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Distance Efficiency', fontsize=12)
        cbar.ax.tick_params(labelsize=10)
        # FIX: the previous np.linspace(..., dtype=int) truncated tick positions
        # to integers, producing duplicate (often all-zero) ticks whenever the
        # metric range was fractional or narrower than ~5. Keep float tick
        # positions and pick a label format adapted to the range width.
        tick_positions = np.linspace(vmin, vmax, num=5)
        cbar.set_ticks(tick_positions)
        if (vmax - vmin) >= 5:
            cbar.ax.set_yticklabels([f'{tick:.0f}' for tick in tick_positions])
        else:
            cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])
    else:
        # Hide empty subplot
        axes[i].set_visible(False)
# Delete any unused subplots (grid has 12 cells but only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
# Set main title for the figure
plt.suptitle('Distance Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
# Show the plot
plt.show()
# @title Freight Share by Product Category (Seller Locations, Dot Size by Orders Filled)
# Cell purpose: per top-10 category, scatter mean seller locations coloured by
# mean 'freight_share' (fixed 0..1 colour scale), dot size = orders filled.
# Relies on the notebook-global 'orders_cleaned' DataFrame from earlier cells.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Mapping of product categories from Portuguese to English
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# Get the 10 largest product categories
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# Create a 4x3 grid of subplots (sufficient for 10 largest categories, with some empty spaces)
fig, axes = plt.subplots(4, 3, figsize=(14, 12))  # 4 rows and 3 columns for 10 plots
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Choose colormap and invert if necessary
colormap = 'viridis'
# Invert colormap because a lower freight share is better (darker = better)
invert_cmap = True  # Set based on metric logic
# FIX: plt.cm.get_cmap() is deprecated since matplotlib 3.7 and removed in
# 3.9; use the matplotlib.colormaps registry, as the sibling cells already do.
cmap = colormaps[colormap + ("_r" if invert_cmap else "")]
# Loop through the 10 largest categories and plot
for i, category in enumerate(largest_categories):
    # Filter data for the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # Aggregate seller information by seller_id (one dot per seller)
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'freight_share': 'mean',  # Mean freight share
        'order_id': 'count'  # Count of orders fulfilled by the seller
    }).reset_index()
    # Check if there's data for the category
    if not seller_data.empty:
        # Get the column data we want to plot (mean freight share per seller)
        metric = seller_data['freight_share']
        # Fixed 0..1 normalization so all subplots share the same colour scale
        # (freight share is a proportion of order value)
        vmin = 0
        vmax = 1
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Create a scatterplot of seller locations based on metric
        scatter = sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,  # Use freight share as a color metric
            size=seller_data['order_id'],  # Use order count to size the dots
            sizes=(50, 400),  # Increased min/max size range of seller dots based on order count
            ax=axes[i],  # Use the corresponding subplot
            legend=False,  # No legend for individual plots
            hue_norm=norm,  # Normalize hue for consistent mapping
            palette=cmap  # Inverted viridis colormap
        )
        # Add subtitle for each plot based on product category
        translated_category = category_translation.get(category, category)  # Fallback to original if not found
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        # Set labels for the axes
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Add a color bar as a legend
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Freight Share', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Change the tick label size
        # Set ticks manually for the color bar
        tick_positions = np.linspace(vmin, vmax, 6)  # 6 ticks from 0 to 1
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])  # Format ticks to 2 decimal places
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Delete any unused subplots (grid has 12 cells but only 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout for clarity, leaving headroom for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
# Set main title for the entire figure
plt.suptitle('Freight Share by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
# Show the plot
plt.show()
# @title Delivery Times by Largest Cities
# Cell purpose: aggregate four per-city averages (delivery time, late-delivery
# rate, payment value, freight share) and show each as a bar chart in a 2x2 grid.
# Relies on the notebook-global 'orders_cleaned' DataFrame from earlier cells.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# First entry is counted per city; the remaining four are averaged and plotted.
metrics = ['order_id', 'delivery_time_days', 'late_delivery', 'total_payment_value', 'freight_share']
# Per-city aggregation: order count plus the mean of each delivery metric.
grouped = (
    orders_cleaned
    .groupby('city')[metrics]
    .agg({
        'order_id': 'count',
        'delivery_time_days': 'mean',
        'late_delivery': 'mean',
        'total_payment_value': 'mean',
        'freight_share': 'mean',
    })
    .reset_index()
    .rename(columns={'order_id': 'count'})
)
# Global plot style: serif font and consistent title/label/tick sizes.
plt.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})
# One viridis shade per city, sampled evenly across the colormap.
n_cities = len(grouped['city'].unique())
viridis_colors = list(plt.cm.viridis(np.linspace(0, 1, n_cities)))
# 2x2 grid: one bar subplot per averaged metric (order count is skipped).
plt.figure(figsize=(14, 10))
for idx, metric in enumerate(metrics[1:], start=1):
    ax = plt.subplot(2, 2, idx)
    sns.barplot(
        data=grouped,
        x='city',
        y=metric,
        hue="city",
        palette=viridis_colors[:n_cities],
        legend=False,
    )
    pretty_name = metric.replace('_', ' ').capitalize()
    ax.set_title(pretty_name)
    ax.set_xlabel('City')
    ax.set_ylabel(pretty_name)
    # Keep only the bottom spine for a cleaner look.
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# @title Key Sales Metric Comparison Across Cities - Heatmaps
# Cell purpose: build a city x product-category pivot for each of ten metrics
# and render one annotated heatmap per metric. The first two metrics are
# colour-scaled over the whole matrix; the rest are scaled per column.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Assuming 'orders_cleaned' is already labeled with cities
# Step 1: Group by city and product category, including additional metrics
grouped = orders_cleaned.groupby(['city', 'product_category_name']).agg(
    order_count=('order_id', 'count'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    late_delivery_count=('late_delivery', lambda x: (x == 1).sum()),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    freight_share=('freight_share', 'mean'),
    review_score=('review_score', 'mean')
).reset_index()
# Step 2: Calculate delivery delay percentage (late orders / all orders, per cell)
grouped['delivery_delay_percentage'] = (grouped['late_delivery_count'] / grouped['order_count']) * 100
# Step 3: Get top 10 product categories based on order count
top_10_product_categories = grouped.groupby('product_category_name')['order_count'].sum().nlargest(10).index.tolist()
# Filter grouped data for top 10 product categories
grouped_top_10 = grouped[grouped['product_category_name'].isin(top_10_product_categories)]
# Step 5: Create a mapping for product categories (Portuguese -> English)
category_mapping = {
    'automotivo': 'Automotive',
    'beleza_saude': 'Beauty & Health',
    'brinquedos': 'Toys',
    'cama_mesa_banho': 'Bedding, Table & Bath',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'utilidades_domesticas': 'Household Items'
}
# Step 6: Metrics and formatting information (display title, decimal places)
metrics = {
    'order_count': ('Order Count', 0),
    'avg_delivery_time': ('Average Delivery Time (Days)', 2),
    'delivery_delay_percentage': ('Delivery Delay Percentage (%)', 2),
    'freight_cost_per_km': ('Freight Cost per Km', 2),
    'price_to_distance': ('Price to Distance', 2),
    'cost_efficiency': ('Cost Efficiency', 2),
    'order_size_per_km': ('Order Size per Km', 2),
    'total_order_cost_per_km': ('Total Order Cost per Km', 2),
    'freight_share': ('Freight Share', 2),
    'review_score': ('Review Score', 2)
}
# Define color map notes for matrix-wide and column-wise scaling
matrix_note = "*Color map applied for whole matrix, comparing all cities and product categories"
column_note = "*Color map applied column-wise, comparing cities per product category"
# Step 7: Loop through each metric, create pivot tables, and display as heatmaps with appropriate scaling
for i, (metric, (title, decimal_places)) in enumerate(metrics.items()):
    # Create a pivot table (cities as rows, categories as columns); missing
    # city/category combinations become 0
    pivot_table = grouped_top_10.pivot(index='city', columns='product_category_name', values=metric).fillna(0)
    pivot_table.rename(columns=category_mapping, inplace=True)
    pivot_table.columns.name = None  # Drop the columns-axis name for a cleaner header
    # Decide on normalization type and set appropriate note
    if i < 2:  # For the first two metrics, normalize across the entire matrix
        normed_pivot = (pivot_table - pivot_table.values.min()) / (pivot_table.values.max() - pivot_table.values.min())
        color_map_note = matrix_note
    else:  # For other metrics, normalize each column (product category) independently for column-wise coloring
        normed_pivot = pivot_table.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
        color_map_note = column_note
    # Plot the heatmap: colours come from the normalized values, annotations
    # show the original (unnormalized) values
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(
        normed_pivot,
        cmap='viridis',
        annot=pivot_table.round(decimal_places),  # Use original values for annotations
        fmt=f".{decimal_places}f",
        cbar_kws={'label': title}
    )
    plt.title(f"{title} by City and Product Category")
    plt.xlabel("Product Category")
    plt.ylabel("City")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    # Adjust the colorbar position (leave room on the right for the note)
    plt.subplots_adjust(right=0.85)
    colorbar = ax.collections[0].colorbar
    # Place the color map note slightly to the right and center it vertically with the color bar
    colorbar.ax.text(
        3.0,  # Adjust to move right
        0.5,  # Center vertically
        color_map_note,
        fontsize=10,
        color='black',
        ha='left',
        va='center',  # Center alignment vertically
        transform=colorbar.ax.transAxes
    )
    plt.show()
# @title Top Sellers with Key Metrics - Heatmap
# Cell purpose: rank the 30 largest sellers by order count and show their key
# metrics as a single heatmap, colour-scaled per column; metrics where higher
# is worse get an inverted scale so "good" is always the same colour.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Assuming 'orders_cleaned' includes all necessary seller-level data
# Step 1: Group by seller and aggregate metrics
grouped_sellers = orders_cleaned.groupby('seller_id').agg(
    order_count=('order_id', 'count'),
    avg_order_size=('total_payment_value', 'mean'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    avg_freight_share=('freight_share', 'mean'),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    late_delivery_count=('late_delivery', lambda x: (x == 1).sum()),
    review_score=('review_score', 'mean')
).reset_index()
# Step 2: Calculate delivery delay percentage
grouped_sellers['delivery_delay_percentage'] = (grouped_sellers['late_delivery_count'] / grouped_sellers['order_count']) * 100
# Step 3: Sort sellers by order count and select top 30 sellers
top_sellers = grouped_sellers.sort_values(by='order_count', ascending=False).head(30)
# Step 4: Replace seller_id with rank based on order count
top_sellers['Rank'] = range(1, len(top_sellers) + 1)
top_sellers.set_index('Rank', inplace=True)
# Step 5: Select metrics for heatmap
metrics = [
    'order_count', 'avg_order_size', 'avg_delivery_time', 'avg_freight_share',
    'freight_cost_per_km', 'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'delivery_delay_percentage', 'review_score'
]
pivot_data = top_sellers[metrics]
# Define which metrics are "negative" where higher values are not preferred
negative_metrics = ['avg_delivery_time', 'avg_freight_share', 'freight_cost_per_km',
                    'price_to_distance', 'total_order_cost_per_km', 'delivery_delay_percentage']
# Step 6: Normalize each column for per-column color scaling, inverting where higher values are not beneficial
normed_data = pivot_data.apply(lambda x: (x.max() - x) / (x.max() - x.min()) if x.name in negative_metrics
                               else (x - x.min()) / (x.max() - x.min()), axis=0)
# Step 7: Format data for annotation — whole numbers for order_count, two
# decimals for everything else. (A previous astype(int) step here was
# redundant: the '{:.0f}' mapping below already renders integers.)
formatted_data = pivot_data.apply(
    lambda x: x.map('{:.0f}'.format) if x.name == 'order_count' else x.map('{:.2f}'.format)
)
# Step 8: Plot the heatmap with per-column scaling
plt.figure(figsize=(12, 8))
sns.heatmap(
    normed_data,
    cmap='viridis',
    annot=formatted_data,  # Display formatted values
    fmt="",  # Annotations are pre-formatted strings
    cbar_kws={'label': 'Normalized Scale (Per Column)'}
)
# Add title and labels
plt.title("Top 30 Sellers with Key Metrics - Heatmap")
plt.xlabel("Metric")
plt.ylabel("Largest Sellers (Rank)")
# Customize tick label rotation
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
# Adjust layout
plt.tight_layout()
plt.show()
# @title Delivery Performance Metrics by Product Category On-Time vs. Late Deliveries
# Cell purpose: compare key metrics between on-time and late deliveries for the
# top-10 categories, as two column-normalized heatmaps.
# Step 1: Aggregate by product category and late delivery status.
# FIX: order_count previously used ('order_id', 'sum') — order_id is a string
# hash in the Olist data, so 'sum' concatenated IDs and the subsequent
# pd.to_numeric(..., errors='coerce') turned every count into NaN -> 0, making
# the top-10 selection and category ordering below meaningless. 'count' is the
# correct aggregation (and matches every other cell in this notebook).
aggregated_data = orders_cleaned.groupby(['product_category_name', 'late_delivery']).agg(
    order_count=('order_id', 'count'),
    avg_order_size=('total_payment_value', 'mean'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    avg_freight_share=('freight_share', 'mean'),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    review_score=('review_score', 'mean')
).reset_index()
# 'count' always yields integers, so the old to_numeric/fillna workaround for
# the string-concatenation bug is no longer needed.
# Step 2: Calculate the top 10 categories by total order count and filter the data
top_10_categories = aggregated_data.groupby('product_category_name')['order_count'].sum().nlargest(10).index.tolist()
filtered_data = aggregated_data[aggregated_data['product_category_name'].isin(top_10_categories)]
# Step 3: Pivot the DataFrame with delivery status as the second level of columns
pivoted_data = filtered_data.pivot_table(
    index='product_category_name',
    columns='late_delivery',
    values=[
        'avg_order_size',
        'avg_delivery_time',
        'avg_freight_share',
        'freight_cost_per_km',
        'price_to_distance',
        'cost_efficiency',
        'order_size_per_km',
        'total_order_cost_per_km',
        'review_score'
    ],
    aggfunc='mean'
)
# Step 4: Rename columns for clarity (late_delivery 0 -> 'On-Time', else 'Late')
pivoted_data.columns = pd.MultiIndex.from_tuples(
    [(metric, 'On-Time' if late_deliv == 0 else 'Late') for metric, late_deliv in pivoted_data.columns],
    names=["Metric", "Delivery Status"]
)
# Step 5: Reset index to bring 'product_category_name' into the DataFrame
pivoted_data.reset_index(inplace=True)
# Step 6: Translate product category names to English
translation_mapping = {
    'cama_mesa_banho': 'Bedding and Bath',
    'beleza_saude': 'Beauty and Health',
    'esporte_lazer': 'Sports and Leisure',
    'utilidades_domesticas': 'Household Utilities',
    'informatica_acessorios': 'Computers and Accessories',
    'moveis_decoracao': 'Furniture and Decoration',
    'relogios_presentes': 'Watches and Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
pivoted_data['product_category_name'] = pivoted_data['product_category_name'].map(translation_mapping).fillna(pivoted_data['product_category_name'])
# Step 7: Order product categories by total order count (descending)
total_order_counts = aggregated_data.groupby('product_category_name')['order_count'].sum()
sorted_categories = total_order_counts[top_10_categories].sort_values(ascending=False).index
# NOTE(review): the Categorical is built from the translated names while
# sorted_categories holds the original Portuguese names; rows whose category
# was translated fall back to NaN ordering — confirm intended ordering.
pivoted_data['product_category_name'] = pd.Categorical(
    pivoted_data['product_category_name'], categories=sorted_categories, ordered=True
)
pivoted_data = pivoted_data.sort_values('product_category_name')
# Step 8: Reshape data to prepare for heatmap (metric level to rows, back out)
final_table = pivoted_data.set_index('product_category_name').stack(level=0).unstack(level=1)
# Step 9: Separate data for On-Time and Late deliveries
on_time_data = final_table.xs('On-Time', level='Delivery Status', axis=1)
late_data = final_table.xs('Late', level='Delivery Status', axis=1)
# Step 10: Normalize each column independently for column-wise color scaling
on_time_normalized = on_time_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
late_normalized = late_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
# Step 11: Plot heatmaps for On-Time and Late deliveries side by side with column-wise normalization
plt.figure(figsize=(14, 10))
sns.heatmap(on_time_normalized, cmap='viridis', annot=on_time_data, fmt=".2f", cbar=True)
plt.title("Delivery Performance Metrics by Product Category - On-Time Deliveries")
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(14, 10))
sns.heatmap(late_normalized, cmap='viridis', annot=late_data, fmt=".2f", cbar=True)
plt.title("Delivery Performance Metrics by Product Category - Late Deliveries")
plt.xticks(rotation=45)
plt.show()
# @title Correlation matrix
# Cell purpose: lower-triangle correlation heatmap across all numeric delivery,
# cost, distance, product-dimension and review metrics.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Columns entering the correlation analysis: delivery-stage shares, monetary
# fields, distance/efficiency ratios, product dimensions and duration metrics.
cols = [
    'processing_share', 'carrier_share', 'customer_share', 'price',
    'freight_value', 'payment_installments', 'total_payment_value',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km',
    'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'freight_share', 'product_weight_g',
    'product_length_cm', 'product_height_cm', 'product_width_cm',
    'product_vol', 'review_score', 'delivery_time_days', 'order_processing_time_days',
    'carrier_delivery_time_days', 'customer_delivery_time_days', 'time_to_estimate_delivery_days'
]
# Pairwise Pearson correlations over the selected columns.
corr_matrix = orders_cleaned[cols].corr()
# Boolean mask hiding the redundant upper triangle (incl. the diagonal's mirror).
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
# Large canvas so that 26x26 annotated cells stay legible.
plt.figure(figsize=(14, 12))
heatmap = sns.heatmap(
    corr_matrix,
    mask=mask,
    cmap="viridis",
    square=True,
    annot=True,
    fmt=".2f",  # Two decimal places per cell
    annot_kws={"size": 10},  # Annotation font size
    cbar_kws={"shrink": .75},  # Slightly shortened colorbar
)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 16}, pad=16)
plt.show()
# @title Key Performance Metrics for Late vs. On-Time Deliveries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Two shades from the Viridis palette: one bar color for on-time, one for late
colors = sns.color_palette("viridis", 2)
# Work on a copy so the source DataFrame is left untouched
df_copy = orders_cleaned.copy()
# Binary late-delivery flag: a positive time_to_estimate_delivery_days means
# the order arrived after the estimated date (NaN compares False -> 0,
# matching the original lambda's behavior)
df_copy['late_delivery'] = (df_copy['time_to_estimate_delivery_days'] > 0).astype(int)
# Step 1: Get the top 10 product categories based on total delivery counts
top_10_categories = df_copy['product_category_name'].value_counts().nlargest(10).index
# Metrics compared between on-time and late deliveries in the plots below
metrics = [
    'total_payment_value', 'price', 'freight_value', 'distance_km',
    'review_score', 'distance_efficiency', 'freight_cost_per_km',
    'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'freight_share'
]
# Portuguese -> English category names for readable axis labels
category_mapping = {
    'cama_mesa_banho': 'Bedding, Table, and Bath',
    'beleza_saude': 'Beauty and Health',
    'esporte_lazer': 'Sports and Leisure',
    'utilidades_domesticas': 'Household Utilities',
    'moveis_decoracao': 'Furniture and Decoration',
    'informatica_acessorios': 'Computers and Accessories',
    'relogios_presentes': 'Watches and Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# Step 2: Cap infinite values before plotting.
# BUG FIX: the original computed mean/std while the column still contained
# ±inf (yielding inf/NaN statistics), then replaced ±inf with NaN, which made
# the subsequent np.where(... == np.inf) capping lines dead code. Here the
# statistics are computed on finite values only, then +inf is capped at
# mean + 3*std and -inf at mean - 3*std, as the original comments intended.
for metric in metrics:
    # Statistics over finite values only (±inf masked out as NaN)
    finite = df_copy[metric].replace([np.inf, -np.inf], np.nan)
    mean_value = finite.mean()
    std_value = finite.std()
    # Cap +inf at mean + 3*std and -inf at mean - 3*std; finite values pass through
    df_copy[metric] = np.where(
        np.isposinf(df_copy[metric]), mean_value + 3 * std_value,
        np.where(np.isneginf(df_copy[metric]), mean_value - 3 * std_value, df_copy[metric])
    )
# Step 3: one bar chart per metric, stacked vertically in a single column
fig, axes = plt.subplots(nrows=len(metrics), ncols=1, figsize=(14, 6 * len(metrics)))

for idx, metric in enumerate(metrics):
    # Mean of the current metric per (category, late-flag) pair
    grouped = (
        df_copy
        .groupby(['product_category_name', 'late_delivery'])[metric]
        .mean()
        .reset_index()
    )
    # Restrict to the top-10 categories computed earlier
    grouped = grouped[grouped['product_category_name'].isin(top_10_categories)]
    # Wide format: one column per delivery status, one row per category
    wide = grouped.pivot(
        index='product_category_name',
        columns='late_delivery',
        values=metric,
    ).reset_index()
    wide.columns = ['product_category_name', 'On Time', 'Late']
    # Translate category names into English for the axis labels
    wide['product_category_name'] = wide['product_category_name'].map(category_mapping)

    # Grouped bar plot: on-time vs late side by side per category
    ax = axes[idx]
    wide.set_index('product_category_name').plot(kind='bar', ax=ax, color=colors)

    # Titles, axis labels and readable tick labels
    pretty_name = metric.replace("_", " ").title()
    ax.set_title(f'Mean {pretty_name} for Top 10 Product Categories')
    ax.set_xlabel('Product Categories')
    ax.set_ylabel(f'Mean {pretty_name}')
    ax.set_xticks(range(len(wide['product_category_name'])))
    ax.set_xticklabels(wide['product_category_name'], rotation=45)
    ax.legend(title='Delivery Status', labels=['On Time', 'Late'])

    # Drop the top/left/right spines for a cleaner look
    for side in ('top', 'left', 'right'):
        ax.spines[side].set_visible(False)

# Avoid label overlap between the stacked subplots, then render
plt.tight_layout()
plt.show()